library(tidyverse)
[37m── [1mAttaching packages[22m ───────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[37m[32m✔[37m [34mggplot2[37m 2.2.1 [32m✔[37m [34mpurrr [37m 0.2.4
[32m✔[37m [34mtibble [37m 1.4.2 [32m✔[37m [34mdplyr [37m 0.7.4
[32m✔[37m [34mtidyr [37m 0.8.0 [32m✔[37m [34mstringr[37m 1.3.0
[32m✔[37m [34mreadr [37m 1.1.1 [32m✔[37m [34mforcats[37m 0.3.0[39m
[37m── [1mConflicts[22m ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[37m [34mdplyr[37m::[32mfilter()[37m masks [34mstats[37m::filter()
[31m✖[37m [34mdplyr[37m::[32mlag()[37m masks [34mstats[37m::lag()[39m
Error in names(frame) <- `*vtmp*` : names() applied to a non-vector
library(cowplot)
Attaching package: ‘cowplot’
The following object is masked from ‘package:ggplot2’:
ggsave
## Define data paths
in_path_varscan = 'data/maf/6c93f518-1956-4435-9806-37185266d248/TCGA.BRCA.varscan.6c93f518-1956-4435-9806-37185266d248.DR-10.0.somatic.maf.gz'
in_path_muse = 'data/maf/b8ca5856-9819-459c-87c5-94e91aca4032/TCGA.BRCA.muse.b8ca5856-9819-459c-87c5-94e91aca4032.DR-10.0.somatic.maf.gz'
in_path_ss = 'data/maf/7dd592e3-5950-4438-96d5-3c718aca3f13/TCGA.BRCA.somaticsniper.7dd592e3-5950-4438-96d5-3c718aca3f13.DR-10.0.somatic.maf.gz'
in_path_mutect = 'data/maf/995c0111-d90b-4140-bee7-3845436c3b42/TCGA.BRCA.mutect.995c0111-d90b-4140-bee7-3845436c3b42.DR-10.0.somatic.maf.gz'
df = read_tsv(
in_path_mutect,
comment = '#'
)
print(dim(df))
[1] 120988 120
head(df, 10)
## Define column types
col_types = cols_only(
Chromosome = 'c',
Start_Position = 'i',
End_Position = 'i',
SYMBOL = 'c',
Reference_Allele = 'c',
Allele = 'c',
Variant_Classification = 'c',
IMPACT = 'c',
Variant_Type = 'c',
Tumor_Sample_Barcode = 'c'
)
## Concatenate vertically
in_paths = c(
in_path_varscan,
in_path_muse,
in_path_ss,
in_path_mutect
)
df = bind_rows(lapply(
in_paths,
read_tsv,
col_types = col_types,
comment = '#'
))
## Convert characters to factors
# Note, if we weren't concatenating,
# this could be done while reading
# by specifying
# `COL_NAME = col_factor(levels=NULL)`
# inside cols_only()
df = mutate_if(
df,
is.character,
as.factor
)
print(dim(df))
[1] 370461 10
head(df)
## Rename
col_name_map = list(
CHR = 'Chromosome',
START = 'Start_Position',
END = 'End_Position',
GENE = 'SYMBOL',
REF = 'Reference_Allele',
ALT = 'Allele',
CLASS = 'Variant_Classification',
IMPACT = 'IMPACT',
TYPE = 'Variant_Type',
BARCODE = 'Tumor_Sample_Barcode'
)
df = rename(df, !!! col_name_map)
## Reorder
keep_cols = names(col_name_map)
df = select(df, keep_cols)
head(df)
levels(df$GENE) = c(levels(df$GENE), 'INTERGENIC')
df$GENE[is.na(df$GENE)] = 'INTERGENIC'
## New columns by assignment
df$SAMPLE = str_sub(df$BARCODE, 1, 12)
df$MUTATION = paste(
df$CHR,
df$GENE,
df$START,
df$END,
df$REF,
df$ALT,
sep = ':'
)
df$TYPE2 = ifelse(df$TYPE == 'SNP', 'SNP', 'INDEL')
## New columns with mutate()
# Equivalent to above (and thus redundant)
df = mutate(
df,
SAMPLE = str_sub(BARCODE, 1, 12),
MUTATION = paste(
CHR,
GENE,
START,
END,
REF,
ALT,
sep = ':'
),
TYPE2 = ifelse(TYPE == 'SNP', 'SNP', 'INDEL')
)
## Convert new columns to factors if applicable,
## possibly with manually-set levels
df$SAMPLE = as.factor(df$SAMPLE)
df$MUTATION = as.factor(df$MUTATION)
df$TYPE2 = factor(df$TYPE2, levels = c('SNP', 'INDEL'))
print(dim(df))
[1] 370461 13
head(df)
## Remove duplicate rows
df = unique(df)
## Remove duplicate based on column
df_mut = df[!duplicated(df$MUTATION),]
print(dim(df))
[1] 132916 13
print(dim(df_mut))
[1] 130625 13
drop_cols = c('BARCODE', 'SAMPLE')
df_mut = select(df_mut, -one_of(drop_cols))
print(dim(df_mut))
[1] 130625 11
head(df_mut)
## Create dummy frame
df_tmp = df
drop_cols = c(
'CHR',
'GENE',
'START',
'END',
'REF',
'ALT'
)
df_tmp = select(df_tmp, -one_of(drop_cols))
## Create new columns
df_tmp = separate(
df_tmp,
MUTATION,
drop_cols,
':'
)
print(dim(df_tmp))
[1] 132916 12
head(df_tmp)
write_delim(
df,
'data/out/df_r.tsv',
delim = '\t'
)
rds_path = 'data/out/df.rds'
## Write
saveRDS(df, rds_path)
## Read
df = readRDS(rds_path)
## Shorten to relevant columns
# Note, we could include other columns here
# like GENE and IMPACT, but we won't
# to match the python example.
## Create a dummy column w/ fill value
df_tmp = select(df, c('SAMPLE', 'MUTATION'))
df_tmp$EXISTS = 1
## Long-to-wide
df_wide = spread(
df_tmp,
key = 'SAMPLE',
value = 'EXISTS',
fill = 0
)
rm(df_tmp)
print(dim(df_wide))
[1] 130625 987
head(df_wide)
## Select columns to add on and col to join on
cols = c('MUTATION', 'GENE', 'IMPACT')
## Merge
df_wide = merge(
df_wide, select(df_mut, cols),
by = 'MUTATION',
all.x = TRUE
)
## Re-order columns
# Put new columns first
df_wide = select(df_wide, cols, everything())
# Note, first 3 columns are out of order
# w/ python example.
print(dim(df_wide))
[1] 130625 989
head(df_wide)
# Set value and id columns
value_cols = grep('TCGA-', names(df_wide), value = TRUE)
# Melt
df_long = gather(
df_wide,
key = 'SAMPLE',
value = 'EXISTS',
value_cols
)
# Clean up
df_long = filter(df_long, EXISTS != 0)
df_long = select(df_long, -EXISTS)
print(dim(df_long))
[1] 132916 4
head(df_long)
rm(df_wide, df_long)
n_genes = length(unique(df$GENE))
n_samples = length(unique(df$SAMPLE))
n_mutations = length(unique(df$MUTATION))
n_mutation_classes = length(unique(df$CLASS))
print('Number of unique')
[1] "Number of unique"
cat('\tSamples: ', n_samples, '\n')
Samples: 986
cat('\tGenes: ', n_genes, '\n')
Genes: 19168
cat('\tMutations: ', n_mutations, '\n')
Mutations: 130625
cat('\tMutation Classes: ', n_mutation_classes, '\n')
Mutation Classes: 18
class_counts = sort(summary(df$CLASS), decreasing = TRUE)
impact_counts = sort(summary(df$IMPACT), decreasing = TRUE)
type_counts = sort(summary(df$TYPE), decreasing = TRUE)
print('Counts per CLASS:')
[1] "Counts per CLASS:"
print(class_counts)
Missense_Mutation Silent 3'UTR Intron
66371 23881 11052 6990
Nonsense_Mutation Frame_Shift_Del 5'UTR RNA
6056 3661 3392 2543
Frame_Shift_Ins Splice_Site Splice_Region 3'Flank
1975 1921 1537 1132
5'Flank In_Frame_Del In_Frame_Ins Nonstop_Mutation
906 869 433 93
Translation_Start_Site IGR
90 14
print('Counts per IMPACT:')
[1] "Counts per IMPACT:"
print(impact_counts)
MODERATE MODIFIER LOW HIGH
67648 26059 25413 13796
print('Counts per TYPE:')
[1] "Counts per TYPE:"
print(type_counts)
SNP DEL INS
121319 7316 4281
## Display some instances
# Need to override the maxsum = 50 arg
mut_counts = sort(
summary(df$MUTATION, maxsum = n_mutations),
decreasing = TRUE)
print('Top repeated (>10) mutations:')
[1] "Top repeated (>10) mutations:"
print(mut_counts[mut_counts > 10])
chr3:PIK3CA:179234297:179234297:A:G chr3:PIK3CA:179218303:179218303:G:A
121 63
chr3:PIK3CA:179218294:179218294:G:A chr1:ST6GALNAC3:76576946:76576947:-:AAAC
43 33
chr14:AKT1:104780214:104780214:C:T chr10:GATA3:8069470:8069471:CA:-
25 21
chr3:MUC4:195783009:195783009:C:T chr17:TP53:7675088:7675088:C:T
21 20
chr3:MUC4:195783008:195783008:A:G chr15:GOLGA6L6:20535018:20535018:C:T
20 17
chr3:PIK3CA:179203765:179203765:T:A chr3:PIK3CA:179234297:179234297:A:T
17 13
chr6:OPRM1:154107953:154107958:TTTTTA:- chr17:TP53:7673802:7673802:C:T
13 12
chr16:PKD1P6:15104542:15104542:T:A chr1:NBPF12:146963189:146963189:G:C
11 11
sample_counts = sort(
summary(df$SAMPLE, maxsum = n_samples),
decreasing = TRUE)
print('Samples with most mutations:')
[1] "Samples with most mutations:"
print(sample_counts[0:10])
TCGA-AN-A046 TCGA-AC-A23H TCGA-5L-AAT1 TCGA-BH-A18G TCGA-AN-A0AK TCGA-A8-A09Z TCGA-BH-A0HF
7948 6711 2117 2033 2029 1924 1695
TCGA-AO-A128 TCGA-D8-A1XK TCGA-BH-A0B6
1644 1541 1419
print('Samples with least mutations:')
[1] "Samples with least mutations:"
print(tail(sample_counts, 10))
TCGA-A2-A0ES TCGA-AC-A2FB TCGA-AR-A24W TCGA-AR-A252 TCGA-LL-A440 TCGA-AO-A1KO TCGA-A2-A1G6
18 17 16 16 16 15 12
TCGA-A8-A08C TCGA-A2-A25F TCGA-AC-A2FK
9 7 2
df_summary = df %>%
group_by(GENE) %>%
summarise(
N_MUTATIONS = n(),
N_UNIQUE_MUTATIONS = n_distinct(MUTATION),
N_SAMPLES = n_distinct(BARCODE)
)
head(df_summary)
df_summary = df_summary %>%
arrange(desc(N_SAMPLES), N_MUTATIONS)
head(df_summary)
df_impact = df %>%
group_by(GENE, IMPACT) %>%
summarise(COUNT = n_distinct(MUTATION))
print(dim(df_impact))
[1] 46692 3
head(df_impact)
## Make counts per group
df_sample_counts = df %>%
group_by(SAMPLE, TYPE2, IMPACT) %>%
summarize(N_MUTATIONS = n()) %>%
filter(N_MUTATIONS >= 20, N_MUTATIONS <= 200)
## Calculate stats
df_sample_stats = df_sample_counts %>%
group_by(TYPE2, IMPACT) %>%
summarize(
MEDIAN = median(N_MUTATIONS),
UPPER = quantile(N_MUTATIONS, .75),
LOWER = quantile(N_MUTATIONS, .25),
MEAN = mean(N_MUTATIONS),
STD = sd(N_MUTATIONS),
MIN = min(N_MUTATIONS),
MAX = max(N_MUTATIONS)
)
head(df_sample_counts)
df_sample_stats
Use cowplot for nice theme and subplotting https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html
img_dir = 'img/'
NOTE: shape = ‘4’ corresponds to X’s
x = 0:(n_samples-1)
y = unname(sample_counts)
p = ggplot(mapping = aes(x, y)) +
geom_line() +
geom_point(shape = 4) +
ggtitle('Mutations per Individual') +
xlab('Individual Rank') +
ylab('Number of Mutations')
p
ggsave(paste0(img_dir, 'rank_r.png'))
Saving 5.28 x 3.26 in image
p = ggplot(
mapping = aes(names(class_counts), unname(class_counts))
) +
geom_bar(stat = 'identity') +
scale_x_discrete(limits = rev(names(class_counts))) +
coord_flip()
p
ggsave(paste0(img_dir, 'bar_r.png'))
Saving 5.28 x 3.26 in image
## First plot
# Reuse instance from above
p1 = p
## Second plot
n_genes_to_plot = 5
genes = df_summary$GENE[0:n_genes_to_plot]
counts = df_summary$N_SAMPLES[0:n_genes_to_plot]
p2 = ggplot(mapping = aes(genes, counts)) +
geom_bar(stat = 'identity') +
scale_x_discrete(limits = genes)
p_grid = plot_grid(p1, p2, ncol = 2)
save_plot(paste0(img_dir, 'subplot_r.png'), p_grid, ncol = 2)
p_grid
## New df's for plotting
df_identity = df_summary %>%
arrange(-N_MUTATIONS)
df_count = df
## Plot params
n_genes_to_plot = 5
genes = df_identity$GENE[0:n_genes_to_plot]
counts = df_identity$N_MUTATIONS[0:n_genes_to_plot]
## Identity plot
p1 = ggplot(mapping = aes(genes, counts)) +
geom_bar(stat = 'identity') +
scale_x_discrete(limits = genes) +
ggtitle('# Mutations - Identity')
## Count plot
p2 = ggplot(df, aes(GENE)) +
geom_bar() +
scale_x_discrete(limits = genes) +
ggtitle('# Mutations - Count')
p_grid = plot_grid(p1, p2, ncol = 2)
Removed 131195 rows containing non-finite values (stat_count).
save_plot(paste0(img_dir, 'count_vs_identity_r.png'), p_grid, ncol = 2)
p_grid
## Plot params
n_genes_to_plot = 5
genes = df_summary$GENE[0:n_genes_to_plot]
## Filter to relvant data
df_plt = df_impact %>%
filter(GENE %in% genes)
## Reorder for plot
# This is an alternative to
# scale_x_discrete(limits=genes)
df_plt$GENE = factor(df_plt$GENE, levels = genes)
p = ggplot(data = df_plt) +
geom_bar(
aes(x=GENE, y=COUNT, fill=IMPACT),
stat = 'identity',
position = 'dodge'
) +
scale_fill_brewer(
type='qual',
palette = 'Set1'
) +
ggtitle('Impact of Mutations')
p
ggsave(paste0(img_dir, 'group_r.png'), p)
Saving 5.28 x 3.26 in image
## Get some samples and genes
samples = names(sample_counts[0:2])
genes = c('PIK3CA', 'TTN', 'MUC16')
## Filter to relevant data
df_plt = df %>%
filter(SAMPLE %in% samples & GENE %in% genes)
## Reorder
df_plt$GENE = factor(df_plt$GENE, levels = genes)
df_plt$SAMPLE = factor(df_plt$SAMPLE, levels = samples)
## Plot
p = ggplot(df_plt) +
geom_bar(
aes(x=GENE, fill=IMPACT),
position = 'dodge'
) +
scale_fill_brewer(
type='qual',
palette = 'Set1'
) +
facet_wrap('SAMPLE')
p
ggsave(paste0(img_dir, 'facet_r.png'), p)
Saving 5.28 x 3.26 in image
## Set order
df_sample_counts$TYPE2 = factor(df_sample_counts$TYPE2, levels = c('INDEL', 'SNP'))
df_sample_counts$IMPACT = factor(df_sample_counts$IMPACT, levels = c(
'HIGH', 'MODERATE', 'LOW', 'MODIFIER'))
## Box plot
p = ggplot(df_sample_counts, aes(TYPE2, N_MUTATIONS, fill = IMPACT)) +
geom_boxplot() +
coord_flip() +
scale_fill_brewer(
type='qual',
palette = 'Set1'
) +
ggtitle('Mutations Per Sample') +
xlab('Type')
p
ggsave(paste0(img_dir, 'box_group_r.png'), p)
Saving 5.28 x 3.26 in image